{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from io import StringIO\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-2-89ffedf7a14e>:7: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts)\n"
     ]
    }
   ],
   "source": [
    "# Configure Chrome and start the browser session used by every cell below.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # avoids the 'DevToolsActivePort file doesn't exist' startup error\n",
    "# Set the browser window size.\n",
    "# NOTE(review): Chrome documents this switch as --window-size=WIDTH,HEIGHT; confirm the '1920x3000' form takes effect.\n",
    "opts.add_argument('window-size=1920x3000')\n",
    "opts.add_argument('--disable-gpu')  # Google's docs mention this flag is needed to work around a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false')  # do not load images, to speed things up\n",
    "# Pass the options via `options=`; the `chrome_options=` keyword is deprecated.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CNKI home page, then pause briefly before the next step.\n",
    "cnki_home_url = 'https://www.cnki.net/'\n",
    "driver.get(cnki_home_url)\n",
    "time.sleep(0.3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 查看是否登录成功"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the markup of the login-name element so the login state can be inspected.\n",
    "login_name_html = driver.find_element(By.ID, 'Ecp_loginShowName1').get_attribute('innerHTML')\n",
    "time.sleep(0.3)\n",
    "# Keep the value as the cell's last expression; previously it was computed and then\n",
    "# discarded because time.sleep() came last, so the check displayed nothing.\n",
    "login_name_html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击高级检索（此时打开两个页面，需要切换窗口）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the element with id 'highSearch' (the advanced-search link per the heading above).\n",
    "# find_element(By...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "driver.find_element(By.ID, 'highSearch').click()\n",
    "time.sleep(0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-2EC916A1403D8D9BE52210EFB3455A27', 'CDwindow-B5C0CA4626C5F20F675C8E0F50CB5268']\n"
     ]
    }
   ],
   "source": [
    "# Show every open window handle, then move the driver's focus to the last one in the list.\n",
    "open_handles = driver.window_handles\n",
    "print(open_handles)\n",
    "driver.switch_to.window(open_handles[-1])\n",
    "time.sleep(0.3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 点击期刊"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the journals entry. The locator is an absolute XPath, so it depends on the exact page layout.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "driver.find_element(By.XPATH, '/html/body/div[3]/div[1]/div/ul[1]/li[1]/a').click()\n",
    "time.sleep(0.3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 选择核心期刊，C刊进行勾选"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the checkbox under label[5] (presumably one of the core-journal / CSSCI filters named in the heading -- confirm).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[5]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tick the checkbox under label[6] (presumably the other core-journal / CSSCI filter -- confirm).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div/label[6]/input').click()\n",
    "time.sleep(0.3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 填写检索的关键信息，进行检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the search term into the first query input of the #gradetxt form.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "driver.find_element(By.XPATH, '//*[@id=\"gradetxt\"]/dd[1]/div[2]/input').send_keys(\"数据挖掘\")\n",
    "time.sleep(0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the input that submits the query (presumably the search button -- confirm against the page).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "driver.find_element(By.XPATH, '/html/body/div[2]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input').click()\n",
    "time.sleep(0.3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 用selenium寻找元素（属性、selector、xpath）按照发表时间的顺序进行检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled: would click the second entry of #orderList\n",
    "# (presumably the sort-by-publication-date option named in the heading -- confirm).\n",
    "#driver.find_element_by_xpath('//*[@id=\"orderList\"]/li[2]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled: same input path as the earlier search-submit click, but rooted at body/div[4] instead of body/div[2].\n",
    "#driver.find_element_by_xpath('/html/body/div[4]/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/input').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the markup of the pager title cell for inspection.\n",
    "pager_title_html = driver.find_element(By.CLASS_NAME, 'pagerTitleCell').get_attribute('innerHTML')\n",
    "time.sleep(0.3)\n",
    "# Keep the value as the cell's last expression; previously it was computed and then\n",
    "# discarded because time.sleep() came last.\n",
    "pager_title_html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "455\n",
      "9086\n",
      "<class 'int'>\n"
     ]
    }
   ],
   "source": [
    "# Read the total hit count from the <em> inside the pager title and derive the page count.\n",
    "results = driver.find_element(By.CLASS_NAME, 'pagerTitleCell').find_element(By.TAG_NAME, 'em').text\n",
    "all_re = int(results.replace(\",\",\"\"))  # drop commas before converting, e.g. '9,086' -> 9086\n",
    "num_results = all_re\n",
    "# Ceiling division: the previous int(n/20) + 1 reported one page too many whenever n was an\n",
    "# exact multiple of the page size (e.g. 40 results -> 3 pages instead of 2).\n",
    "# NOTE(review): later cells click an option in #perPageDiv; confirm 20 is still the page size in use downstream.\n",
    "results_per_page = 20\n",
    "num_pages = (num_results + results_per_page - 1) // results_per_page\n",
    "print(num_pages)\n",
    "print(num_results)\n",
    "print(type(num_results))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 爬取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the toggle icon inside #perPageDiv (presumably opens the results-per-page dropdown -- confirm).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/div/i')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the third option in the #perPageDiv list\n",
    "# (presumably 50 per page, since the table parsed below has 50 rows -- confirm).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/ul/li[3]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>基于数据挖掘的宫颈癌动物模型应用分析  网络首发</td>\n",
       "      <td>韩艳珍; 白明; 康乐; 张瑾; 苗明三</td>\n",
       "      <td>中药药理与临床</td>\n",
       "      <td>2021-06-09 16:07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>基于深度学习的知识追踪研究进展  网络首发</td>\n",
       "      <td>刘铁园; 陈威; 常亮; 古天龙</td>\n",
       "      <td>计算机研究与发展</td>\n",
       "      <td>2021-06-09 15:24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>基于数据挖掘的中药治疗宫颈癌用药规律分析  网络首发</td>\n",
       "      <td>兰俊;綦向军;莫嘉浩;李丹云;吴金凤</td>\n",
       "      <td>中药药理与临床</td>\n",
       "      <td>2021-06-09 15:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>脉络学说理论分析不稳定型心绞痛医案证治规律及网络药理学机制分析  网络首发</td>\n",
       "      <td>王康;李雅文;常丽萍;尹玉洁;朱垚</td>\n",
       "      <td>中国实验方剂学杂志</td>\n",
       "      <td>2021-06-09 14:28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>基于数据挖掘的慢性不可预知温和刺激抑郁症动物模型特点分析  网络首发</td>\n",
       "      <td>刘慧娟;康乐;留甜甜;乔靖怡;白明</td>\n",
       "      <td>中药药理与临床</td>\n",
       "      <td>2021-06-09 10:36</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>古代医籍中治疗筋病内服药规律的数据挖掘研究  网络首发</td>\n",
       "      <td>覃堃; 施展</td>\n",
       "      <td>中药药理与临床</td>\n",
       "      <td>2021-06-08 17:51</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>企业战略性知识轮廓:化工专利数据挖掘与分析  网络首发</td>\n",
       "      <td>王江; 郭鑫彬</td>\n",
       "      <td>情报杂志</td>\n",
       "      <td>2021-06-08 17:45</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>基于代价敏感卷积神经网络的非平衡问题混合方法  网络首发</td>\n",
       "      <td>黄颖琦; 陈红梅</td>\n",
       "      <td>计算机科学</td>\n",
       "      <td>2021-06-08 15:20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>岩石地球化学数据挖掘及弱异常识别——以新疆阿舍勒铜矿为例  网络首发</td>\n",
       "      <td>郑超杰;刘攀峰;罗先熔;文美兰;黄文斌</td>\n",
       "      <td>大地构造与成矿学</td>\n",
       "      <td>2021-06-08 11:09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>基于中医传承辅助平台分析中医药治疗尿源性脓毒症的证治规律</td>\n",
       "      <td>姚晓彬</td>\n",
       "      <td>中药新药与临床药理</td>\n",
       "      <td>2021-06-08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11</td>\n",
       "      <td>文本与数据挖掘对著作权例外体系的冲击与应对  网络首发</td>\n",
       "      <td>马治国; 赵龙</td>\n",
       "      <td>西北师大学报(社会科学版)</td>\n",
       "      <td>2021-06-07 17:24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>44.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>12</td>\n",
       "      <td>Grover算法改进与应用综述  网络首发</td>\n",
       "      <td>刘晓楠; 宋慧超; 王洪; 江舵; 安家乐</td>\n",
       "      <td>计算机科学</td>\n",
       "      <td>2021-06-07 14:53</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>13</td>\n",
       "      <td>甘松中医药用药规律的数据挖掘</td>\n",
       "      <td>饶瑶;李冉;王晓雯;薛变霞;李世伟</td>\n",
       "      <td>中草药</td>\n",
       "      <td>2021-06-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>14</td>\n",
       "      <td>利用转录组数据挖掘东紫苏单萜生物合成相关基因</td>\n",
       "      <td>耿秀文; 张爱丽; 唐仁华; 普春霞</td>\n",
       "      <td>中草药</td>\n",
       "      <td>2021-06-07</td>\n",
       "      <td>NaN</td>\n",
       "      <td>33.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>15</td>\n",
       "      <td>论坛情感挖掘研究综述：现状、挑战与趋势  网络首发</td>\n",
       "      <td>陈迪;程朗;王志锋;熊锦鹏;张玉茹</td>\n",
       "      <td>计算机工程与应用</td>\n",
       "      <td>2021-06-03 16:00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>228.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>16</td>\n",
       "      <td>基于CNKI文献计量分析的过程挖掘研究评述与展望</td>\n",
       "      <td>花龙雪; 吴应良</td>\n",
       "      <td>管理学报</td>\n",
       "      <td>2021-06-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>254.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>17</td>\n",
       "      <td>基于数据挖掘技术的钛合金铣削工艺参数优化  网络首发</td>\n",
       "      <td>刘献礼; 孙庆贞; 岳彩旭; 李恒帅</td>\n",
       "      <td>计算机集成制造系统</td>\n",
       "      <td>2021-05-31 17:09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>122.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>18</td>\n",
       "      <td>现代名老中医医案数据库的构建与数据处理</td>\n",
       "      <td>赵泽鹏; 戴国华; 高武霖</td>\n",
       "      <td>中医杂志</td>\n",
       "      <td>2021-05-31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>139.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>19</td>\n",
       "      <td>基于数据挖掘的冷藏陈列柜的负荷预测研究</td>\n",
       "      <td>袁培;雷正霖;曾庆辉;武宜霄;吕彦力</td>\n",
       "      <td>流体机械</td>\n",
       "      <td>2021-05-30</td>\n",
       "      <td>NaN</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>20</td>\n",
       "      <td>基于LSTM的飞行数据挖掘模型构建方法研究  网络首发</td>\n",
       "      <td>王志刚; 王业光; 杨宁; 米禹丰; 曲晓雷</td>\n",
       "      <td>航空学报</td>\n",
       "      <td>2021-05-27 13:49</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>21</td>\n",
       "      <td>轨迹数据的时空模式挖掘与管理决策研究综述  网络首发</td>\n",
       "      <td>孙爽; 陈燕; 朴在吉; 张金松</td>\n",
       "      <td>计算机工程与应用</td>\n",
       "      <td>2021-05-27 11:44</td>\n",
       "      <td>NaN</td>\n",
       "      <td>78.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>22</td>\n",
       "      <td>基于新闻大数据的北极地区地缘关系研究</td>\n",
       "      <td>李萌;袁文;袁武;牛方曲;李汉青</td>\n",
       "      <td>地理学报</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>146.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>23</td>\n",
       "      <td>基于数据挖掘的中药治疗鼻咽癌的用药规律分析</td>\n",
       "      <td>方彩珊;綦向军;萧韵婷;莫嘉浩;廖梦颖</td>\n",
       "      <td>中药新药与临床药理</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>197.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>24</td>\n",
       "      <td>基于数据挖掘方法分析曹志群治疗慢性萎缩性胃炎的用药规律</td>\n",
       "      <td>战俊邑;张新;丁楠;高慧;严如根</td>\n",
       "      <td>中药新药与临床药理</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>138.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>25</td>\n",
       "      <td>基于渔船AIS数据的南海北部海洋渔业捕捞强度空间特征挖掘</td>\n",
       "      <td>李晓恩;周亮;肖杨;吴文周;苏奋振</td>\n",
       "      <td>地球信息科学学报</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>67.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>26</td>\n",
       "      <td>基于DTW与K-means算法的河北场雨及雨型分区特征研究</td>\n",
       "      <td>李雨欣;王瑛;马庆媛;刘天雪;司丽丽</td>\n",
       "      <td>地球信息科学学报</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>80.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>27</td>\n",
       "      <td>数据挖掘技术在肺癌危险度预测模型中的应用</td>\n",
       "      <td>高孜博;李迪;段书音;周晓蕾;刘红</td>\n",
       "      <td>肿瘤防治研究</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>28</td>\n",
       "      <td>基于大数据和数据挖掘下的商务与经济统计——评《商务与经济统计》</td>\n",
       "      <td>孙玲</td>\n",
       "      <td>热带作物学报</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29</td>\n",
       "      <td>打造教育人工智能大脑:教育数据中台技术实现路径</td>\n",
       "      <td>李爱霞; 舒杭; 顾小清</td>\n",
       "      <td>开放教育研究</td>\n",
       "      <td>2021-05-24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>283.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>30</td>\n",
       "      <td>MGLL在前列腺癌组织中的表达及其对癌细胞生长的调控作用</td>\n",
       "      <td>陈南辉;钟伟枫;潘斌;王晓红;黄志成</td>\n",
       "      <td>暨南大学学报(自然科学与医学版)</td>\n",
       "      <td>2021-05-24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>65.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>31</td>\n",
       "      <td>区块链交易网络研究综述  网络首发</td>\n",
       "      <td>吴嘉婧; 刘洁利; 林丹; 郑子彬</td>\n",
       "      <td>中山大学学报(自然科学版)</td>\n",
       "      <td>2021-05-21 16:59</td>\n",
       "      <td>NaN</td>\n",
       "      <td>827.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>32</td>\n",
       "      <td>运用数据挖掘和网络药理学探讨糖尿病认知功能障碍中医用药规律和作用机制  网络首发</td>\n",
       "      <td>石崯力; 王旭; 盛沛; 张擎; 梁婕</td>\n",
       "      <td>天然产物研究与开发</td>\n",
       "      <td>2021-05-20 16:44</td>\n",
       "      <td>NaN</td>\n",
       "      <td>184.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>水电站尾水位特性解析与建模  网络首发</td>\n",
       "      <td>贾本军; 周建中; 陈潇; 张勇传; 田梦琦</td>\n",
       "      <td>水力发电学报</td>\n",
       "      <td>2021-05-20 09:13</td>\n",
       "      <td>NaN</td>\n",
       "      <td>40.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>34</td>\n",
       "      <td>基于数据挖掘和网络药理学的早期DN用药规律及机制分析</td>\n",
       "      <td>曲超;张冰冰;姜楠;张柯欣;石岩</td>\n",
       "      <td>沈阳药科大学学报</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>64.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>35</td>\n",
       "      <td>计算教育学国内发展现状分析与未来展望——基于语言模型和自然语言生成技术</td>\n",
       "      <td>贾维辰; 彭俊; 任英杰</td>\n",
       "      <td>远程教育杂志</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>60.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>解码历史——宜兴丁蜀古南街历史风貌保护与更新中的数字技术与实践</td>\n",
       "      <td>唐芃; 王笑; 华好</td>\n",
       "      <td>建筑学报</td>\n",
       "      <td>2021-05-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>37</td>\n",
       "      <td>人群动态的观测理论及其未来发展思考  网络首发</td>\n",
       "      <td>方志祥</td>\n",
       "      <td>地球信息科学学报</td>\n",
       "      <td>2021-05-19 15:20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>41.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>38</td>\n",
       "      <td>基于数据挖掘的国医大师王琦治疗慢性前列腺炎用药规律研究  网络首发</td>\n",
       "      <td>刘桂敏;汤轶波;白雪;陈亚飞;刘丹</td>\n",
       "      <td>中国中医药信息杂志</td>\n",
       "      <td>2021-05-19 10:21</td>\n",
       "      <td>NaN</td>\n",
       "      <td>148.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>39</td>\n",
       "      <td>计算法学的疆域</td>\n",
       "      <td>季卫东</td>\n",
       "      <td>社会科学辑刊</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>169.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>40</td>\n",
       "      <td>大数据时代高校图书馆智慧服务的逻辑与路径</td>\n",
       "      <td>左平熙</td>\n",
       "      <td>图书馆工作与研究</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>371.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>41</td>\n",
       "      <td>大数据技术视域下智慧图书馆伦理危机与控制研究</td>\n",
       "      <td>陆康; 刘慧; 曹畋</td>\n",
       "      <td>高校图书馆工作</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>76.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>42</td>\n",
       "      <td>基于数据挖掘的中药专利复方治疗慢性胆囊炎的用药规律分析</td>\n",
       "      <td>郝少东; 杨闪闪; 刘彩萍; 李月廷</td>\n",
       "      <td>中国现代应用药学</td>\n",
       "      <td>2021-05-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>38.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>43</td>\n",
       "      <td>面部色素性疾病中药面膜专利处方分析及作用机制探讨  网络首发</td>\n",
       "      <td>李瑶;付浩;李文林;杨丽丽;曾莉</td>\n",
       "      <td>中国皮肤性病学杂志</td>\n",
       "      <td>2021-05-13 15:49</td>\n",
       "      <td>NaN</td>\n",
       "      <td>116.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>学习者与平台交互行为挖掘及学习预测模型构建</td>\n",
       "      <td>王亮</td>\n",
       "      <td>中国远程教育</td>\n",
       "      <td>2021-05-12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>255.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>45</td>\n",
       "      <td>基于人工智能技术的重大活动食品安全与风险评估综述</td>\n",
       "      <td>李晓理; 卜坤; 翟玉鹏; 王康</td>\n",
       "      <td>北京工业大学学报</td>\n",
       "      <td>2021-05-10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>93.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>46</td>\n",
       "      <td>面向智能攻击的行为预测研究</td>\n",
       "      <td>马钰锡; 张全新; 谭毓安; 沈蒙</td>\n",
       "      <td>软件学报</td>\n",
       "      <td>2021-05-09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>226.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>47</td>\n",
       "      <td>轨迹表示学习技术研究进展</td>\n",
       "      <td>曹翰林; 唐海娜; 王飞; 徐勇军</td>\n",
       "      <td>软件学报</td>\n",
       "      <td>2021-05-09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>74.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>48</td>\n",
       "      <td>基于数据挖掘分析《理淪骈文》治疗五官疾病用药规律  网络首发</td>\n",
       "      <td>姜楠; 潘赐明; 韩利震; 李应红; 董昌武</td>\n",
       "      <td>中国中医基础医学杂志</td>\n",
       "      <td>2021-05-07 14:08</td>\n",
       "      <td>NaN</td>\n",
       "      <td>127.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>49</td>\n",
       "      <td>机器学习与针灸学领域结合的研究进展  网络首发</td>\n",
       "      <td>梁吉;韩名媛;王承斌;吕晓琳;孙忠人</td>\n",
       "      <td>针刺研究</td>\n",
       "      <td>2021-05-07 13:12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>161.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>50</td>\n",
       "      <td>基于概率图模型的计算机课程教学认知诊断框架(英文)  网络首发</td>\n",
       "      <td>胡心颖; 何钰; 孙广中</td>\n",
       "      <td>中国科学技术大学学报</td>\n",
       "      <td>2021-05-06 17:09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>67.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                        篇名  \\\n",
       "0            1                  基于数据挖掘的宫颈癌动物模型应用分析  网络首发   \n",
       "1            2                     基于深度学习的知识追踪研究进展  网络首发   \n",
       "2            3                基于数据挖掘的中药治疗宫颈癌用药规律分析  网络首发   \n",
       "3            4     脉络学说理论分析不稳定型心绞痛医案证治规律及网络药理学机制分析  网络首发   \n",
       "4            5        基于数据挖掘的慢性不可预知温和刺激抑郁症动物模型特点分析  网络首发   \n",
       "5            6               古代医籍中治疗筋病内服药规律的数据挖掘研究  网络首发   \n",
       "6            7               企业战略性知识轮廓:化工专利数据挖掘与分析  网络首发   \n",
       "7            8              基于代价敏感卷积神经网络的非平衡问题混合方法  网络首发   \n",
       "8            9        岩石地球化学数据挖掘及弱异常识别——以新疆阿舍勒铜矿为例  网络首发   \n",
       "9           10              基于中医传承辅助平台分析中医药治疗尿源性脓毒症的证治规律   \n",
       "10          11               文本与数据挖掘对著作权例外体系的冲击与应对  网络首发   \n",
       "11          12                     Grover算法改进与应用综述  网络首发   \n",
       "12          13                            甘松中医药用药规律的数据挖掘   \n",
       "13          14                    利用转录组数据挖掘东紫苏单萜生物合成相关基因   \n",
       "14          15                 论坛情感挖掘研究综述：现状、挑战与趋势  网络首发   \n",
       "15          16                  基于CNKI文献计量分析的过程挖掘研究评述与展望   \n",
       "16          17                基于数据挖掘技术的钛合金铣削工艺参数优化  网络首发   \n",
       "17          18                       现代名老中医医案数据库的构建与数据处理   \n",
       "18          19                       基于数据挖掘的冷藏陈列柜的负荷预测研究   \n",
       "19          20               基于LSTM的飞行数据挖掘模型构建方法研究  网络首发   \n",
       "20          21                轨迹数据的时空模式挖掘与管理决策研究综述  网络首发   \n",
       "21          22                        基于新闻大数据的北极地区地缘关系研究   \n",
       "22          23                     基于数据挖掘的中药治疗鼻咽癌的用药规律分析   \n",
       "23          24               基于数据挖掘方法分析曹志群治疗慢性萎缩性胃炎的用药规律   \n",
       "24          25              基于渔船AIS数据的南海北部海洋渔业捕捞强度空间特征挖掘   \n",
       "25          26             基于DTW与K-means算法的河北场雨及雨型分区特征研究   \n",
       "26          27                      数据挖掘技术在肺癌危险度预测模型中的应用   \n",
       "27          28           基于大数据和数据挖掘下的商务与经济统计——评《商务与经济统计》   \n",
       "28          29                   打造教育人工智能大脑:教育数据中台技术实现路径   \n",
       "29          30              MGLL在前列腺癌组织中的表达及其对癌细胞生长的调控作用   \n",
       "30          31                         区块链交易网络研究综述  网络首发   \n",
       "31          32  运用数据挖掘和网络药理学探讨糖尿病认知功能障碍中医用药规律和作用机制  网络首发   \n",
       "32          33                       水电站尾水位特性解析与建模  网络首发   \n",
       "33          34                基于数据挖掘和网络药理学的早期DN用药规律及机制分析   \n",
       "34          35       计算教育学国内发展现状分析与未来展望——基于语言模型和自然语言生成技术   \n",
       "35          36           解码历史——宜兴丁蜀古南街历史风貌保护与更新中的数字技术与实践   \n",
       "36          37                   人群动态的观测理论及其未来发展思考  网络首发   \n",
       "37          38         基于数据挖掘的国医大师王琦治疗慢性前列腺炎用药规律研究  网络首发   \n",
       "38          39                                   计算法学的疆域   \n",
       "39          40                      大数据时代高校图书馆智慧服务的逻辑与路径   \n",
       "40          41                    大数据技术视域下智慧图书馆伦理危机与控制研究   \n",
       "41          42               基于数据挖掘的中药专利复方治疗慢性胆囊炎的用药规律分析   \n",
       "42          43            面部色素性疾病中药面膜专利处方分析及作用机制探讨  网络首发   \n",
       "43          44                     学习者与平台交互行为挖掘及学习预测模型构建   \n",
       "44          45                  基于人工智能技术的重大活动食品安全与风险评估综述   \n",
       "45          46                             面向智能攻击的行为预测研究   \n",
       "46          47                              轨迹表示学习技术研究进展   \n",
       "47          48            基于数据挖掘分析《理淪骈文》治疗五官疾病用药规律  网络首发   \n",
       "48          49                   机器学习与针灸学领域结合的研究进展  网络首发   \n",
       "49          50           基于概率图模型的计算机课程教学认知诊断框架(英文)  网络首发   \n",
       "\n",
       "                        作者                刊名              发表时间  被引     下载  操作  \n",
       "0     韩艳珍; 白明; 康乐; 张瑾; 苗明三           中药药理与临床  2021-06-09 16:07 NaN    NaN  下载  \n",
       "1         刘铁园; 陈威; 常亮; 古天龙          计算机研究与发展  2021-06-09 15:24 NaN    NaN  下载  \n",
       "2       兰俊;綦向军;莫嘉浩;李丹云;吴金凤           中药药理与临床  2021-06-09 15:00 NaN    NaN  下载  \n",
       "3        王康;李雅文;常丽萍;尹玉洁;朱垚         中国实验方剂学杂志  2021-06-09 14:28 NaN    NaN  下载  \n",
       "4        刘慧娟;康乐;留甜甜;乔靖怡;白明           中药药理与临床  2021-06-09 10:36 NaN    NaN  下载  \n",
       "5                   覃堃; 施展           中药药理与临床  2021-06-08 17:51 NaN    7.0  下载  \n",
       "6                  王江; 郭鑫彬              情报杂志  2021-06-08 17:45 NaN    6.0  下载  \n",
       "7                 黄颖琦; 陈红梅             计算机科学  2021-06-08 15:20 NaN   23.0  下载  \n",
       "8      郑超杰;刘攀峰;罗先熔;文美兰;黄文斌          大地构造与成矿学  2021-06-08 11:09 NaN    5.0  下载  \n",
       "9                      姚晓彬         中药新药与临床药理        2021-06-08 NaN   19.0  下载  \n",
       "10                 马治国; 赵龙     西北师大学报(社会科学版)  2021-06-07 17:24 NaN   44.0  下载  \n",
       "11   刘晓楠; 宋慧超; 王洪; 江舵; 安家乐             计算机科学  2021-06-07 14:53 NaN   40.0  下载  \n",
       "12       饶瑶;李冉;王晓雯;薛变霞;李世伟               中草药        2021-06-07 NaN   62.0  下载  \n",
       "13      耿秀文; 张爱丽; 唐仁华; 普春霞               中草药        2021-06-07 NaN   33.0  下载  \n",
       "14       陈迪;程朗;王志锋;熊锦鹏;张玉茹          计算机工程与应用  2021-06-03 16:00 NaN  228.0  下载  \n",
       "15                花龙雪; 吴应良              管理学报        2021-06-01 NaN  254.0  下载  \n",
       "16      刘献礼; 孙庆贞; 岳彩旭; 李恒帅         计算机集成制造系统  2021-05-31 17:09 NaN  122.0  下载  \n",
       "17           赵泽鹏; 戴国华; 高武霖              中医杂志        2021-05-31 NaN  139.0  下载  \n",
       "18      袁培;雷正霖;曾庆辉;武宜霄;吕彦力              流体机械        2021-05-30 NaN    5.0  下载  \n",
       "19  王志刚; 王业光; 杨宁; 米禹丰; 曲晓雷              航空学报  2021-05-27 13:49 NaN  102.0  下载  \n",
       "20        孙爽; 陈燕; 朴在吉; 张金松          计算机工程与应用  2021-05-27 11:44 NaN   78.0  下载  \n",
       "21        李萌;袁文;袁武;牛方曲;李汉青              地理学报        2021-05-25 NaN  146.0  下载  \n",
       "22     方彩珊;綦向军;萧韵婷;莫嘉浩;廖梦颖         中药新药与临床药理        2021-05-25 NaN  197.0  下载  \n",
       "23        战俊邑;张新;丁楠;高慧;严如根         中药新药与临床药理        2021-05-25 NaN  138.0  下载  \n",
       "24       李晓恩;周亮;肖杨;吴文周;苏奋振          地球信息科学学报        2021-05-25 NaN   67.0  下载  \n",
       "25      李雨欣;王瑛;马庆媛;刘天雪;司丽丽          地球信息科学学报        2021-05-25 NaN   80.0  下载  \n",
       "26       高孜博;李迪;段书音;周晓蕾;刘红            肿瘤防治研究        2021-05-25 NaN    2.0  下载  \n",
       "27                      孙玲            热带作物学报        2021-05-25 NaN    NaN  下载  \n",
       "28            李爱霞; 舒杭; 顾小清            开放教育研究        2021-05-24 NaN  283.0  下载  \n",
       "29      陈南辉;钟伟枫;潘斌;王晓红;黄志成  暨南大学学报(自然科学与医学版)        2021-05-24 NaN   65.0  下载  \n",
       "30       吴嘉婧; 刘洁利; 林丹; 郑子彬     中山大学学报(自然科学版)  2021-05-21 16:59 NaN  827.0  下载  \n",
       "31     石崯力; 王旭; 盛沛; 张擎; 梁婕         天然产物研究与开发  2021-05-20 16:44 NaN  184.0  下载  \n",
       "32  贾本军; 周建中; 陈潇; 张勇传; 田梦琦            水力发电学报  2021-05-20 09:13 NaN   40.0  下载  \n",
       "33        曲超;张冰冰;姜楠;张柯欣;石岩          沈阳药科大学学报        2021-05-20 NaN   64.0  下载  \n",
       "34            贾维辰; 彭俊; 任英杰            远程教育杂志        2021-05-20 NaN   60.0  下载  \n",
       "35              唐芃; 王笑; 华好              建筑学报        2021-05-20 NaN    NaN  下载  \n",
       "36                     方志祥          地球信息科学学报  2021-05-19 15:20 NaN   41.0  下载  \n",
       "37       刘桂敏;汤轶波;白雪;陈亚飞;刘丹         中国中医药信息杂志  2021-05-19 10:21 NaN  148.0  下载  \n",
       "38                     季卫东            社会科学辑刊        2021-05-15 NaN  169.0  下载  \n",
       "39                     左平熙          图书馆工作与研究        2021-05-15 NaN  371.0  下载  \n",
       "40              陆康; 刘慧; 曹畋           高校图书馆工作        2021-05-15 NaN   76.0  下载  \n",
       "41      郝少东; 杨闪闪; 刘彩萍; 李月廷          中国现代应用药学        2021-05-15 NaN   38.0  下载  \n",
       "42        李瑶;付浩;李文林;杨丽丽;曾莉         中国皮肤性病学杂志  2021-05-13 15:49 NaN  116.0  下载  \n",
       "43                      王亮            中国远程教育        2021-05-12 NaN  255.0  下载  \n",
       "44        李晓理; 卜坤; 翟玉鹏; 王康          北京工业大学学报        2021-05-10 NaN   93.0  下载  \n",
       "45       马钰锡; 张全新; 谭毓安; 沈蒙              软件学报        2021-05-09 NaN  226.0  下载  \n",
       "46       曹翰林; 唐海娜; 王飞; 徐勇军              软件学报        2021-05-09 NaN   74.0  下载  \n",
       "47  姜楠; 潘赐明; 韩利震; 李应红; 董昌武        中国中医基础医学杂志  2021-05-07 14:08 NaN  127.0  下载  \n",
       "48      梁吉;韩名媛;王承斌;吕晓琳;孙忠人              针刺研究  2021-05-07 13:12 NaN  161.0  下载  \n",
       "49            胡心颖; 何钰; 孙广中        中国科学技术大学学报  2021-05-06 17:09 NaN   67.0  下载  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview of the current result page: take the inner HTML of the #gridTable\n",
    "# element and let pandas parse the first <table> it contains.\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "# NOTE: despite its name, data_dic holds an HTML string, not a dict.\n",
    "data_dic = element.get_attribute('innerHTML')\n",
    "pd.read_html(data_dic)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# driver.switch_to.frame()\n",
    "from lxml import etree\n",
    "import re\n",
    "import requests\n",
    "from urllib.parse import urljoin\n",
    "# Base URL used to turn relative download links into absolute ones.\n",
    "HOST = \"https://kns.cnki.net/\"\n",
    "html = driver.page_source\n",
    "soup = etree.HTML(html)\n",
    "tr_list= soup.xpath('//div[@id=\"gridTable\"]/table/tbody/tr')\n",
    "# Request headers shared by download() and downloader().\n",
    "# The Cookie header is built from the live, logged-in Selenium session rather than\n",
    "# from a pasted cookie string: a hard-coded session cookie leaks credentials when\n",
    "# the notebook is shared and stops working as soon as that session expires.\n",
    "headers = {\n",
    "    \"Cookie\": \"; \".join(f\"{cookie['name']}={cookie['value']}\" for cookie in driver.get_cookies()),\n",
    "    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'\n",
    "}\n",
    "def get_data(href):\n",
    "    \"\"\"Build the CNKI detail-page URL from a search-result link.\n",
    "\n",
    "    Args:\n",
    "        href: Link whose query string carries the ``DbCode``, ``dbname`` and\n",
    "            ``filename`` parameters (matched case-sensitively, in any order).\n",
    "\n",
    "    Returns:\n",
    "        ``https://kns.cnki.net/kcms/detail/detail.aspx?dbcode=...&dbname=...&filename=...``\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if one of the three parameters is missing from ``href``.\n",
    "    \"\"\"\n",
    "    values = []\n",
    "    for key in (\"DbCode\", \"dbname\", \"filename\"):\n",
    "        # [^&]* stops at the next '&' but also accepts a value at the very end of\n",
    "        # the link, which the former '(.*?)&' pattern rejected with an IndexError.\n",
    "        found = re.search(key + \"=([^&]*)\", href)\n",
    "        if found is None:\n",
    "            raise ValueError(f\"missing {key!r} parameter in link: {href}\")\n",
    "        values.append(found.group(1))\n",
    "    DbCode, dbname, filename = values\n",
    "    href = f\"https://kns.cnki.net/kcms/detail/detail.aspx?dbcode={DbCode}&dbname={dbname}&filename={filename}\"\n",
    "    return href\n",
    "\n",
    "def download(href):\n",
    "    \"\"\"Fetch the detail page at ``href`` and return its PDF download URL.\n",
    "\n",
    "    The page is requested with the module-level ``headers``. The link is taken from\n",
    "    the anchor inside ``<li class=\"btn-dlpdf\">`` and resolved against ``HOST``.\n",
    "    When the page has no such anchor the string \"没有下载链接\" is returned instead.\n",
    "    \"\"\"\n",
    "    detail_html = requests.get(url=href,headers=headers).text\n",
    "    detail_tree = etree.HTML(detail_html)\n",
    "    pdf_links = detail_tree.xpath('//li[@class=\"btn-dlpdf\"]/a/@href')\n",
    "    # Guard clause: no PDF button on the page.\n",
    "    if not pdf_links:\n",
    "        return \"没有下载链接\"\n",
    "    return urljoin(HOST,pdf_links[0])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def downloader(title,href):\n",
    "    \"\"\"Download ``href`` and save the response body as ``<title>.pdf``.\n",
    "\n",
    "    Args:\n",
    "        title: Base name of the output file; \".pdf\" is appended.\n",
    "        href: URL to fetch, requested with the module-level ``headers``.\n",
    "\n",
    "    Prints a success message after the file is written, or \"无法下载\" when the\n",
    "    server does not answer with HTTP 200.\n",
    "    \"\"\"\n",
    "    response = requests.get(href,headers=headers)\n",
    "    # Compare the numeric status code; the Response object itself never equals 200.\n",
    "    if response.status_code == 200:\n",
    "        with open(f\"{title}.pdf\",\"wb\") as fp:\n",
    "            fp.write(response.content)\n",
    "        print(f\"{title}\",\"成功下载\")\n",
    "    else:\n",
    "        print(\"无法下载\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upper bound for main()'s page counter: the counter starts at 1 and main() loops\n",
    "# while it is < num_pages.\n",
    "num_pages = 20\n",
    "\n",
    "\n",
    "# Column buffers that main() appends to, one entry per scraped result row.\n",
    "title_list = []\n",
    "author_list = []\n",
    "href_list = []\n",
    "download_url_list = []\n",
    "\n",
    "def main():\n",
    "    \"\"\"Scrape the result pages open in ``driver`` and return them as a DataFrame.\n",
    "\n",
    "    Starting from the page the browser currently shows, every result row yields a\n",
    "    title, an author string, a detail link (via ``get_data``) and a PDF link (via\n",
    "    ``download``); then the next-page button is clicked. The page counter starts at 1\n",
    "    and the loop runs while it is below ``num_pages``, so ``num_pages - 1`` pages are\n",
    "    visited.\n",
    "\n",
    "    A page is committed to the module-level ``title_list`` / ``author_list`` /\n",
    "    ``href_list`` / ``download_url_list`` only after it was parsed completely and the\n",
    "    browser moved on, so a retry never duplicates rows. Those lists are not cleared\n",
    "    here: calling ``main()`` again extends the earlier results.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame with the columns 标题, 作者, 链接, 下载链接.\n",
    "    \"\"\"\n",
    "    now_page_count = 1\n",
    "    while now_page_count < num_pages:\n",
    "        html = driver.page_source\n",
    "        reached_last_page = False\n",
    "        try:\n",
    "            soup = etree.HTML(html)\n",
    "            tr_list= soup.xpath('//div[@id=\"gridTable\"]/table/tbody/tr')\n",
    "            # Buffer the rows of this page locally; they are committed further down.\n",
    "            page_rows = []\n",
    "            for tr in tr_list:\n",
    "                title = \"\".join(tr.xpath('td[@class=\"name\"]//text()')).strip().replace(\"\\n\",\"\").replace(\"                              网络首发\",\"\")\n",
    "                author = \",\".join(tr.xpath('td[@class=\"author\"]/a//text()')).strip().replace(\"\\n\",\"\")\n",
    "                href = tr.xpath('td[@class=\"name\"]/a/@href')[0]\n",
    "                href = get_data(href)\n",
    "                download_url = download(href)\n",
    "                page_rows.append((title, author, href, download_url))\n",
    "            next_buttons = driver.find_elements_by_xpath('//*[@id=\"PageNext\"]')\n",
    "            if next_buttons:\n",
    "                next_buttons[0].click()\n",
    "            elif page_rows:\n",
    "                # Results but no next-page button: this is the final result page.\n",
    "                reached_last_page = True\n",
    "            else:\n",
    "                # Neither results nor a next-page button, e.g. a captcha page.\n",
    "                raise RuntimeError(\"页面没有结果也没有下一页按钮\")\n",
    "        except Exception as exc:\n",
    "            # Typically a captcha after many requests: report it, give the user time\n",
    "            # to solve it in the browser, then retry the same page. Catching Exception\n",
    "            # (not a bare except) keeps Ctrl-C / kernel interrupt working.\n",
    "            print(f\"第{now_page_count}页抓取失败，10秒后重试：{exc!r}\")\n",
    "            time.sleep(10)\n",
    "            continue\n",
    "        for title, author, href, download_url in page_rows:\n",
    "            title_list.append(title)\n",
    "            author_list.append(author)\n",
    "            href_list.append(href)\n",
    "            download_url_list.append(download_url)\n",
    "        now_page_count = now_page_count + 1\n",
    "        if reached_last_page:\n",
    "            break\n",
    "        time.sleep(0.5)\n",
    "    data = {\"标题\":title_list,\"作者\":author_list,\"链接\":href_list,\"下载链接\":download_url_list}\n",
    "    data = pd.DataFrame(data)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>标题</th>\n",
       "      <th>作者</th>\n",
       "      <th>链接</th>\n",
       "      <th>下载链接</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>基于数据挖掘的宫颈癌动物模型应用分析</td>\n",
       "      <td>韩艳珍,白明,康乐,张瑾,苗明三</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>基于深度学习的知识追踪研究进展</td>\n",
       "      <td>刘铁园,陈威,常亮,古天龙</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>基于数据挖掘的中药治疗宫颈癌用药规律分析</td>\n",
       "      <td>兰俊,綦向军,莫嘉浩,李丹云,吴金凤</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>脉络学说理论分析不稳定型心绞痛医案证治规律及网络药理学机制分析</td>\n",
       "      <td>王康,李雅文,常丽萍,尹玉洁,朱垚</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>基于数据挖掘的慢性不可预知温和刺激抑郁症动物模型特点分析</td>\n",
       "      <td>刘慧娟,康乐,留甜甜,乔靖怡,白明</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>945</th>\n",
       "      <td>文本与数据挖掘合理使用例外规范的体系化设置</td>\n",
       "      <td>杨娟</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>946</th>\n",
       "      <td>政治关联能持续改善企业绩效吗——企业发展阶段与行业环境的作用研究</td>\n",
       "      <td>郭海,王超,房唯佳</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>947</th>\n",
       "      <td>基于数据挖掘分析神经元正五聚体蛋白2在肾透明细胞癌中的表达及意义</td>\n",
       "      <td>林晏廷,徐季旗,陈洁</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>948</th>\n",
       "      <td>创业竞赛对提升学生创新创业能力的影响——基于创业竞赛参赛意愿调查问卷的数据挖掘分析</td>\n",
       "      <td>宫毅敏,林镇国</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>949</th>\n",
       "      <td>基于邻节点和关系模型优化的网络表示学习</td>\n",
       "      <td>冶忠林,赵海兴,张科,朱宇,肖玉芝</td>\n",
       "      <td>https://kns.cnki.net/kcms/detail/detail.aspx?d...</td>\n",
       "      <td>https://kns.cnki.net/kcms/download.aspx?filena...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>950 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            标题                  作者  \\\n",
       "0                           基于数据挖掘的宫颈癌动物模型应用分析    韩艳珍,白明,康乐,张瑾,苗明三   \n",
       "1                              基于深度学习的知识追踪研究进展       刘铁园,陈威,常亮,古天龙   \n",
       "2                         基于数据挖掘的中药治疗宫颈癌用药规律分析  兰俊,綦向军,莫嘉浩,李丹云,吴金凤   \n",
       "3              脉络学说理论分析不稳定型心绞痛医案证治规律及网络药理学机制分析   王康,李雅文,常丽萍,尹玉洁,朱垚   \n",
       "4                 基于数据挖掘的慢性不可预知温和刺激抑郁症动物模型特点分析   刘慧娟,康乐,留甜甜,乔靖怡,白明   \n",
       "..                                         ...                 ...   \n",
       "945                      文本与数据挖掘合理使用例外规范的体系化设置                  杨娟   \n",
       "946           政治关联能持续改善企业绩效吗——企业发展阶段与行业环境的作用研究           郭海,王超,房唯佳   \n",
       "947           基于数据挖掘分析神经元正五聚体蛋白2在肾透明细胞癌中的表达及意义          林晏廷,徐季旗,陈洁   \n",
       "948  创业竞赛对提升学生创新创业能力的影响——基于创业竞赛参赛意愿调查问卷的数据挖掘分析             宫毅敏,林镇国   \n",
       "949                        基于邻节点和关系模型优化的网络表示学习   冶忠林,赵海兴,张科,朱宇,肖玉芝   \n",
       "\n",
       "                                                    链接  \\\n",
       "0    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "1    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "2    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "3    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "4    https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "..                                                 ...   \n",
       "945  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "946  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "947  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "948  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "949  https://kns.cnki.net/kcms/detail/detail.aspx?d...   \n",
       "\n",
       "                                                  下载链接  \n",
       "0    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "1    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "2    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "3    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "4    https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "..                                                 ...  \n",
       "945  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "946  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "947  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "948  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "949  https://kns.cnki.net/kcms/download.aspx?filena...  \n",
       "\n",
       "[950 rows x 4 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Run the scraper and show the resulting table.\n",
    "data = main()\n",
    "display(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name the sheet after the actual number of scraped rows instead of a hard-coded count.\n",
    "data.to_excel('CNKI_数据挖掘.xlsx',sheet_name=f\"{len(data)}篇\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
